loading packages

Loading both training and testing data

dataFrameTrainLoad <- read.csv("pml-training.csv", stringsAsFactors = F,na.strings = c("","NA","#DIV/0!"))
dataFrameTestLoad <- read.csv("pml-testing.csv", stringsAsFactors = F,na.strings = c("","NA","#DIV/0!"))
dim(dataFrameTrainLoad); dim(dataFrameTestLoad)
## [1] 19622   160
## [1]  20 160
informationOfTraining <- createDataPartition(dataFrameTrainLoad$classe, p = 0.8, list = F)
valueOfDifferent <- dataFrameTrainLoad[-informationOfTraining,]
dataFrameTrainLoad <- dataFrameTrainLoad[informationOfTraining,]
dim(dataFrameTrainLoad); dim(valueOfDifferent)
## [1] 15699   160
## [1] 3923  160
table(dataFrameTrainLoad$classe)/nrow(dataFrameTrainLoad)
## 
##         A         B         C         D         E 
## 0.2843493 0.1935155 0.1744060 0.1638958 0.1838334
missingBeltVariable <- sapply(select(dataFrameTrainLoad,names(dataFrameTrainLoad)[grepl("_belt",names(dataFrameTrainLoad))]),
                    function(x) sum(is.na(x)))
missingBeltVariable
##            roll_belt           pitch_belt             yaw_belt 
##                    0                    0                    0 
##     total_accel_belt   kurtosis_roll_belt  kurtosis_picth_belt 
##                    0                15382                15400 
##    kurtosis_yaw_belt   skewness_roll_belt skewness_roll_belt.1 
##                15699                15381                15400 
##    skewness_yaw_belt        max_roll_belt       max_picth_belt 
##                15699                15374                15374 
##         max_yaw_belt        min_roll_belt       min_pitch_belt 
##                15382                15374                15374 
##         min_yaw_belt  amplitude_roll_belt amplitude_pitch_belt 
##                15382                15374                15374 
##   amplitude_yaw_belt var_total_accel_belt        avg_roll_belt 
##                15382                15374                15374 
##     stddev_roll_belt        var_roll_belt       avg_pitch_belt 
##                15374                15374                15374 
##    stddev_pitch_belt       var_pitch_belt         avg_yaw_belt 
##                15374                15374                15374 
##      stddev_yaw_belt         var_yaw_belt         gyros_belt_x 
##                15374                15374                    0 
##         gyros_belt_y         gyros_belt_z         accel_belt_x 
##                    0                    0                    0 
##         accel_belt_y         accel_belt_z        magnet_belt_x 
##                    0                    0                    0 
##        magnet_belt_y        magnet_belt_z 
##                    0                    0
missingArmVariable <- sapply(select(dataFrameTrainLoad,names(dataFrameTrainLoad)[grepl("_arm",names(dataFrameTrainLoad))]),
                   function(x) sum(is.na(x)))
missingArmVariable
##            roll_arm           pitch_arm             yaw_arm     total_accel_arm 
##                   0                   0                   0                   0 
##       var_accel_arm        avg_roll_arm     stddev_roll_arm        var_roll_arm 
##               15374               15374               15374               15374 
##       avg_pitch_arm    stddev_pitch_arm       var_pitch_arm         avg_yaw_arm 
##               15374               15374               15374               15374 
##      stddev_yaw_arm         var_yaw_arm         gyros_arm_x         gyros_arm_y 
##               15374               15374                   0                   0 
##         gyros_arm_z         accel_arm_x         accel_arm_y         accel_arm_z 
##                   0                   0                   0                   0 
##        magnet_arm_x        magnet_arm_y        magnet_arm_z   kurtosis_roll_arm 
##                   0                   0                   0               15435 
##  kurtosis_picth_arm    kurtosis_yaw_arm   skewness_roll_arm  skewness_pitch_arm 
##               15437               15384               15434               15437 
##    skewness_yaw_arm        max_roll_arm       max_picth_arm         max_yaw_arm 
##               15384               15374               15374               15374 
##        min_roll_arm       min_pitch_arm         min_yaw_arm  amplitude_roll_arm 
##               15374               15374               15374               15374 
## amplitude_pitch_arm   amplitude_yaw_arm 
##               15374               15374
missingforearmVariable <- sapply(select(dataFrameTrainLoad,
                              names(dataFrameTrainLoad)[grepl("_forearm",names(dataFrameTrainLoad))]),
                       function(x) sum(is.na(x)))
missingforearmVariable
##            roll_forearm           pitch_forearm             yaw_forearm 
##                       0                       0                       0 
##   kurtosis_roll_forearm  kurtosis_picth_forearm    kurtosis_yaw_forearm 
##                   15443                   15444                   15699 
##   skewness_roll_forearm  skewness_pitch_forearm    skewness_yaw_forearm 
##                   15442                   15444                   15699 
##        max_roll_forearm       max_picth_forearm         max_yaw_forearm 
##                   15374                   15374                   15443 
##        min_roll_forearm       min_pitch_forearm         min_yaw_forearm 
##                   15374                   15374                   15443 
##  amplitude_roll_forearm amplitude_pitch_forearm   amplitude_yaw_forearm 
##                   15374                   15374                   15443 
##     total_accel_forearm       var_accel_forearm        avg_roll_forearm 
##                       0                   15374                   15374 
##     stddev_roll_forearm        var_roll_forearm       avg_pitch_forearm 
##                   15374                   15374                   15374 
##    stddev_pitch_forearm       var_pitch_forearm         avg_yaw_forearm 
##                   15374                   15374                   15374 
##      stddev_yaw_forearm         var_yaw_forearm         gyros_forearm_x 
##                   15374                   15374                       0 
##         gyros_forearm_y         gyros_forearm_z         accel_forearm_x 
##                       0                       0                       0 
##         accel_forearm_y         accel_forearm_z        magnet_forearm_x 
##                       0                       0                       0 
##        magnet_forearm_y        magnet_forearm_z 
##                       0                       0
missingdumbbellVariable <- sapply(select(dataFrameTrainLoad,
                               names(dataFrameTrainLoad)[grepl("_dumbbell",names(dataFrameTrainLoad))]),
                        function(x) sum(is.na(x)))
missingdumbbellVariable
##            roll_dumbbell           pitch_dumbbell             yaw_dumbbell 
##                        0                        0                        0 
##   kurtosis_roll_dumbbell  kurtosis_picth_dumbbell    kurtosis_yaw_dumbbell 
##                    15379                    15376                    15699 
##   skewness_roll_dumbbell  skewness_pitch_dumbbell    skewness_yaw_dumbbell 
##                    15378                    15375                    15699 
##        max_roll_dumbbell       max_picth_dumbbell         max_yaw_dumbbell 
##                    15374                    15374                    15379 
##        min_roll_dumbbell       min_pitch_dumbbell         min_yaw_dumbbell 
##                    15374                    15374                    15379 
##  amplitude_roll_dumbbell amplitude_pitch_dumbbell   amplitude_yaw_dumbbell 
##                    15374                    15374                    15379 
##     total_accel_dumbbell       var_accel_dumbbell        avg_roll_dumbbell 
##                        0                    15374                    15374 
##     stddev_roll_dumbbell        var_roll_dumbbell       avg_pitch_dumbbell 
##                    15374                    15374                    15374 
##    stddev_pitch_dumbbell       var_pitch_dumbbell         avg_yaw_dumbbell 
##                    15374                    15374                    15374 
##      stddev_yaw_dumbbell         var_yaw_dumbbell         gyros_dumbbell_x 
##                    15374                    15374                        0 
##         gyros_dumbbell_y         gyros_dumbbell_z         accel_dumbbell_x 
##                        0                        0                        0 
##         accel_dumbbell_y         accel_dumbbell_z        magnet_dumbbell_x 
##                        0                        0                        0 
##        magnet_dumbbell_y        magnet_dumbbell_z 
##                        0                        0
dropColumn2Variable <- c(names(missingBeltVariable[missingBeltVariable != 0]), 
                  names(missingArmVariable[missingArmVariable != 0]),
                  names(missingforearmVariable[missingforearmVariable != 0]),
                  names(missingdumbbellVariable[missingdumbbellVariable != 0]))
length(dropColumn2Variable)
## [1] 100
differenceAnalizeVariable <- tbl_df(dataFrameTrainLoad %>% 
                      select(-dropColumn2Variable,
                             -c(X,user_name, raw_timestamp_part_1, 
                                raw_timestamp_part_2, cvtd_timestamp, 
                                new_window,num_window)))
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(dropColumn2Variable)` instead of `dropColumn2Variable` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
differenceAnalizeVariable$classe <- as.factor(differenceAnalizeVariable$classe)
differenceAnalizeVariable[,1:52] <- lapply(differenceAnalizeVariable[,1:52],as.numeric)
dim(differenceAnalizeVariable)
## [1] 15699    53
correlation_columnVariable <- cor(select(differenceAnalizeVariable, -classe))
diag(correlation_columnVariable) <- 0
correlation_columnVariable <- which(abs(correlation_columnVariable)>0.8,arr.ind = T)
correlation_columnVariable <- unique(row.names(correlation_columnVariable))
corrplot(cor(select(differenceAnalizeVariable,correlation_columnVariable)),
         type="upper", order="hclust",method = "number")
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(correlation_columnVariable)` instead of `correlation_columnVariable` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

correlation_functionDifferenceVariable <- differenceAnalizeVariable %>% binarize(n_bins = 4, thresh_infreq = 0.01)
correlation_Variablea <- correlation_functionDifferenceVariable %>% correlate(target = classe__A) 
correlation_Variablea %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
correlation_Variableb <- correlation_functionDifferenceVariable %>% correlate(target = classe__B)
correlation_Variableb %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))

Correlation variable number c

correlation_Variablec <- correlation_functionDifferenceVariable %>% correlate(target = classe__C)
correlation_Variablec %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))

Temporary variables number c

Temporary variable number d

correaltion_Variabled <- correlation_functionDifferenceVariable %>% correlate(target = classe__D)
correaltion_Variabled %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))

Temporary Variables

correlation_Variablee <- correlation_functionDifferenceVariable %>% correlate(target = classe__E)
correlation_Variablee %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
columnVariablea <- c("magnet_arm_x", "pitch_forearm" , "magnet_dumbbell_y", 
           "roll_forearm", "gyros_dumbbell_y") 
columnVariableb <- c("magnet_dumbbell_y", "magnet_dumbbell_x" , "roll_dumbbell" , 
           "magnet_belt_y" , "accel_dumbbell_x" )
columnVariablec <- c("magnet_dumbbell_y", "roll_dumbbell" , "accel_dumbbell_y" , 
           "magnet_dumbbell_x", "magnet_dumbbell_z")
columnVariabled <- c("pitch_forearm" , "magnet_arm_y" , "magnet_forearm_x",
           "accel_dumbbell_y", "accel_forearm_x")
columnVariablee <- c("magnet_belt_y" , "magnet_belt_z" , "roll_belt", 
           "gyros_belt_z" , "magnet_dumbbell_y")
finalColumnVariablee <- character()
for(c in c(columnVariablea,columnVariableb,columnVariablec,columnVariabled,columnVariablee)){
  finalColumnVariablee <- union(finalColumnVariablee, c)
}
dataFrameAnalize2Variable <- differenceAnalizeVariable %>% select(finalColumnVariablee, classe)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(finalColumnVariablee)` instead of `finalColumnVariablee` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
data.frame("arm" = sum(grepl("_arm",finalColumnVariablee)), 
           "forearm" = sum(grepl("_forearm",finalColumnVariablee)),
           "belt" = sum(grepl("_belt",finalColumnVariablee)),
           "dumbbell" = sum(grepl("_dumbbell",finalColumnVariablee)))
##   arm forearm belt dumbbell
## 1   2       4    4        7

Ploting density and point variables

my_densityVariable <- function(data, mapping, ...) {
  ggplot(data = data, mapping=mapping) +
    geom_density(..., alpha = 0.3)+scale_fill_brewer(palette="Set2") 
}
my_pointVariable <- function(data, mapping, ...) {
  ggplot(data = data, mapping=mapping) +
    geom_point(..., alpha = 0.1)+ scale_fill_brewer(palette="Set2") 
}
ggpairs(dataFrameAnalize2Variable, columns = 1:5,aes(color = classe),
        lower = list(continuous = my_pointVariable),diag = list(continuous = my_densityVariable))

Ploting some data

ggpairs(dataFrameAnalize2Variable, columns = 6:10,aes(color = classe),
        lower = list(continuous = my_pointVariable),diag = list(continuous = my_densityVariable))

ggpairs(dataFrameAnalize2Variable, columns = 11:17,aes(color = classe),
        lower = list(continuous = my_pointVariable),diag = list(continuous = my_densityVariable))

dfTrainF <- dataFrameTrainLoad %>% select(finalColumnVariablee,classe)
dfValF <- valueOfDifferent %>% select(finalColumnVariablee,classe)
dfTrainF[,1:17] <- sapply(dfTrainF[,1:17],as.numeric)
dfValF[,1:17] <- sapply(dfValF[,1:17],as.numeric)
levels <- c("A", "B", "C", "D", "E")
preprop_obj <- preProcess(dfTrainF[,-18],method = c("center","scale","BoxCox"))
xTrain <- predict(preprop_obj,select(dfTrainF,-classe))
yTrain <- factor(dfTrainF$classe,levels=levels)
xVal <- predict(preprop_obj,select(dfValF,-classe))
yVal <- factor(dfValF$classe,levels=levels)
trControl <- trainControl(method="cv", number=5)
modelCT <- train(x = xTrain,y = yTrain, 
                 method = "rpart", trControl = trControl)
modelRF <- train(x = xTrain,y = yTrain, 
                 method = "rf", trControl = trControl,verbose=FALSE, metric = "Accuracy")
modelGBM <- train(x = xTrain,y = yTrain, 
                  method = "gbm",trControl=trControl, verbose=FALSE)
modelSVM <- svm(x = xTrain,y = yTrain,
                kernel = "polynomial", cost = 10)
confusionMatrix(predict(modelCT,xVal),yVal)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1026  307  337  284  100
##          B   17  248   22  120   91
##          C   71  204  325  239  193
##          D    0    0    0    0    0
##          E    2    0    0    0  337
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4935          
##                  95% CI : (0.4777, 0.5093)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3377          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9194  0.32675  0.47515   0.0000  0.46741
## Specificity            0.6338  0.92099  0.78172   1.0000  0.99938
## Pos Pred Value         0.4995  0.49799  0.31492      NaN  0.99410
## Neg Pred Value         0.9518  0.85080  0.87582   0.8361  0.89286
## Prevalence             0.2845  0.19347  0.17436   0.1639  0.18379
## Detection Rate         0.2615  0.06322  0.08284   0.0000  0.08590
## Detection Prevalence   0.5236  0.12694  0.26306   0.0000  0.08641
## Balanced Accuracy      0.7766  0.62387  0.62843   0.5000  0.73339
confusionMatrix(predict(modelRF,xVal),yVal)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1113    5    0    0    0
##          B    0  741    3    2    0
##          C    3   12  679   10    0
##          D    0    1    2  631    2
##          E    0    0    0    0  719
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9898          
##                  95% CI : (0.9861, 0.9927)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9871          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9973   0.9763   0.9927   0.9813   0.9972
## Specificity            0.9982   0.9984   0.9923   0.9985   1.0000
## Pos Pred Value         0.9955   0.9933   0.9645   0.9921   1.0000
## Neg Pred Value         0.9989   0.9943   0.9984   0.9963   0.9994
## Prevalence             0.2845   0.1935   0.1744   0.1639   0.1838
## Detection Rate         0.2837   0.1889   0.1731   0.1608   0.1833
## Detection Prevalence   0.2850   0.1902   0.1795   0.1621   0.1833
## Balanced Accuracy      0.9978   0.9874   0.9925   0.9899   0.9986

Ploting error vs numbet of tree

plot(modelRF$finalModel,main="Error VS no of tree")

confusion matrix and overall statistics

confusionMatrix(predict(modelGBM,xVal),yVal)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1090   23    1    2    4
##          B    9  643   35   19   12
##          C   13   58  632   56   15
##          D    4   35   16  562   14
##          E    0    0    0    4  676
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9184          
##                  95% CI : (0.9094, 0.9268)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8968          
##                                           
##  Mcnemar's Test P-Value : 2.912e-14       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9767   0.8472   0.9240   0.8740   0.9376
## Specificity            0.9893   0.9763   0.9562   0.9790   0.9988
## Pos Pred Value         0.9732   0.8955   0.8165   0.8906   0.9941
## Neg Pred Value         0.9907   0.9638   0.9835   0.9754   0.9861
## Prevalence             0.2845   0.1935   0.1744   0.1639   0.1838
## Detection Rate         0.2778   0.1639   0.1611   0.1433   0.1723
## Detection Prevalence   0.2855   0.1830   0.1973   0.1608   0.1733
## Balanced Accuracy      0.9830   0.9117   0.9401   0.9265   0.9682

Prediction ,confusion Matrix

confusionMatrix(predict(modelSVM,xVal),yVal)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1106   44   20   21    2
##          B    1  668    9    2    3
##          C    6   40  645   45    6
##          D    3    6   10  575   16
##          E    0    1    0    0  694
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9401          
##                  95% CI : (0.9322, 0.9473)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9241          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9910   0.8801   0.9430   0.8942   0.9626
## Specificity            0.9690   0.9953   0.9701   0.9893   0.9997
## Pos Pred Value         0.9271   0.9780   0.8693   0.9426   0.9986
## Neg Pred Value         0.9963   0.9719   0.9877   0.9795   0.9916
## Prevalence             0.2845   0.1935   0.1744   0.1639   0.1838
## Detection Rate         0.2819   0.1703   0.1644   0.1466   0.1769
## Detection Prevalence   0.3041   0.1741   0.1891   0.1555   0.1772
## Balanced Accuracy      0.9800   0.9377   0.9565   0.9418   0.9811

Getting Result

dfTest2 <- dataFrameTestLoad %>% select(finalColumnVariablee,problem_id)
xTest <- dfTest2 %>% select(finalColumnVariablee)
  
resultvaiable <- data.frame("problem_id" = dataFrameTestLoad$problem_id,
                     "PREDICTION_RF" = predict(modelRF,xTest),
                     "PREDICTION_GBM" = predict(modelGBM,xTest),
                     "PREDICTION_SVM" = predict(modelSVM,xTest))
resultvaiable
##    problem_id PREDICTION_RF PREDICTION_GBM PREDICTION_SVM
## 1           1             E              E              C
## 2           2             A              E              A
## 3           3             A              E              B
## 4           4             E              E              A
## 5           5             A              E              A
## 6           6             E              D              A
## 7           7             E              E              B
## 8           8             B              D              B
## 9           9             A              D              E
## 10         10             E              E              E
## 11         11             A              E              B
## 12         12             A              D              A
## 13         13             B              E              E
## 14         14             A              D              B
## 15         15             E              E              B
## 16         16             E              E              A
## 17         17             E              E              C
## 18         18             B              E              A
## 19         19             E              E              A
## 20         20             E              E              D